Skip to content

Commit e9ee3f9

Browse files
committed
Synchronize with llama.cpp upstream
1 parent a22dd04 commit e9ee3f9

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

65 files changed

+29417
-21475
lines changed

llama.cpp/common.cpp

Lines changed: 1611 additions & 1412 deletions
Large diffs are not rendered by default.

llama.cpp/common.h

Lines changed: 225 additions & 106 deletions
Large diffs are not rendered by default.

llama.cpp/ggml-aarch64.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
317317
for (int64_t x = 0; x < nb; x++) {
318318

319319
for (int i = 0; i < nrows_interleaved; i++ ) {
320-
quantize_row_q4_0_reference(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
320+
quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
321321
}
322322

323323
if (nrows_interleaved == 8) {

llama.cpp/ggml-alloc.c

Lines changed: 100 additions & 49 deletions
Large diffs are not rendered by default.

llama.cpp/ggml-backend-impl.h

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@ extern "C" {
1919

2020
struct ggml_backend_buffer_type_i {
2121
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
22+
// allocate a buffer of this type
2223
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
23-
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
24-
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
25-
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
26-
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
24+
// tensor alignment
25+
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
26+
// max buffer size that can be allocated
27+
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
28+
// data size needed to allocate the tensor, including padding
29+
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
2730
// check if tensor data is in host memory
28-
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
2931
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
3032
};
3133

@@ -94,27 +96,37 @@ extern "C" {
9496
void (*GGML_CALL synchronize)(ggml_backend_t backend);
9597

9698
// compute graph with a plan (not used currently)
99+
// create a new plan for a graph
97100
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
98101
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
102+
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
103+
void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
104+
// compute the graph with the plan
105+
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
99106

100-
// compute graph with a plan
101-
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
102107
// compute graph without a plan (async)
103108
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
104109

105-
// check if the backend supports an operation
110+
// check if the backend can compute an operation
106111
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
107112

113+
// check if the backend can use tensors allocated in a buffer type
114+
bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
115+
108116
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
109117
// these should be expensive operations with large batch sizes that may benefit from running on this backend
110118
// even if the weight has to be copied from the CPU temporarily
111119
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
112120

113121
// (optional) event synchronization
122+
// create a new event that can record events on this backend instance
114123
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
115124
void (*GGML_CALL event_free) (ggml_backend_event_t event);
125+
// record an event on the backend instance that created it
116126
void (*GGML_CALL event_record) (ggml_backend_event_t event);
127+
// wait for an event on a different backend instance
117128
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
129+
// block until an event is recorded
118130
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
119131
};
120132

@@ -163,7 +175,7 @@ extern "C" {
163175
void (*GGML_CALL ggml_backend_tensor_set)(struct ggml_tensor *, const void *, size_t, size_t);
164176
bool (*GGML_CALL ggml_is_quantized)(enum ggml_type);
165177
size_t (*GGML_CALL ggml_type_size)(enum ggml_type);
166-
int (*GGML_CALL ggml_blck_size)(enum ggml_type);
178+
int64_t (*GGML_CALL ggml_blck_size)(enum ggml_type);
167179
bool (*GGML_CALL ggml_is_transposed)(const struct ggml_tensor *);
168180
size_t (*GGML_CALL ggml_nbytes)(const struct ggml_tensor *);
169181
enum ggml_unary_op (*GGML_CALL ggml_get_unary_op)(const struct ggml_tensor *);
@@ -180,7 +192,11 @@ extern "C" {
180192
bool (*GGML_CALL ggml_backend_buffer_is_host)(ggml_backend_buffer_t);
181193
bool (*GGML_CALL ggml_guid_matches)(ggml_guid_t, ggml_guid_t);
182194
bool (*GGML_CALL ggml_is_empty)(const struct ggml_tensor *);
195+
enum ggml_backend_buffer_usage (*GGML_CALL ggml_backend_buffer_get_usage)(ggml_backend_buffer_t);
183196
bool (*GGML_CALL ggml_are_same_shape)(const struct ggml_tensor *, const struct ggml_tensor *);
197+
void (*GGML_CALL ggml_abort)(const char *, int, const char *, ...);
198+
bool (*GGML_CALL ggml_is_contiguous_1)(const struct ggml_tensor *);
199+
bool (*GGML_CALL ggml_is_contiguous_2)(const struct ggml_tensor *);
184200
};
185201

186202
#ifdef __cplusplus

0 commit comments

Comments (0)